Initialise the libraries


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from math import log, sqrt
from sklearn import linear_model

Load the data


In [2]:
dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':str, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}

In [3]:
sales = pd.read_csv('../datasets/kc_house_data.csv', dtype=dtype_dict)

testing = pd.read_csv('../datasets/wk3_kc_house_test_data.csv', dtype=dtype_dict)
training = pd.read_csv('../datasets/wk3_kc_house_train_data.csv', dtype=dtype_dict)
validation = pd.read_csv('../datasets/wk3_kc_house_valid_data.csv', dtype=dtype_dict)

Custom numpy functions


In [4]:
def get_numpy_data(data_sframe, features, output):
    data_sframe['constant'] = 1 # add a constant column to an SFrame
    # prepend variable 'constant' to the features list
    features = ['constant'] + features
    # select the columns of data_SFrame given by the ‘features’ list into the SFrame ‘features_sframe’
    features_sframe = data_sframe[features]
    # this will convert the features_sframe into a numpy matrix with GraphLab Create >= 1.7!!
    features_matrix = np.matrix(features_sframe)
    # assign the column of data_sframe associated with the target to the variable ‘output_sarray’
    output_sarray = data_sframe[output]
    # this will convert the SArray into a numpy array:
    output_array = np.array(output_sarray) # GraphLab Create>= 1.7!!
    return(features_matrix, output_array)

In [5]:
def predict_outcome(feature_matrix, weights):
    predictions = np.dot(feature_matrix, weights)
    return(predictions)

In [6]:
def normalize_features(features):
    norms = np.linalg.norm(features, axis=0)
    normalized_features = features / norms
    return (normalized_features, norms)

Simple model


In [7]:
#simple_features = ['sqft_living','bedrooms']
#my_output= 'price'
#(simple_feature_matrix, output) = get_numpy_data(sales, simple_features, my_output)
#initial_weights = np.array([1., 4., 1.])
#(normalized_simple_feature_matrix, norms) = normalize_features(simple_feature_matrix)
#print(simple_feature_matrix)

In [8]:
#prediction = predict_outcome(normalized_simple_feature_matrix, initial_weights)
#print (prediction)
#print (output)
#print (output - prediction)

In [9]:
# Not working for some reason and not sure what this part of the assignment does
#i = 0
#ro=[]
#for feature in simple_features:
    #print (simple_feature_matrix[:,i])
    #ro[i] = SUM[ [feature_i]*(output - prediction + w[i]*[feature_i]) ]
    #ro.append(sum(simple_feature_matrix[:,i] * (output - prediction + initial_weights[i] * simple_feature_matrix[:,i])))
    #i = i + 1
#print (ro)

Single Coordinate Descent Step


In [22]:
def lasso_coordinate_descent_step(i, feature_matrix, output, weights, l1_penalty):
    # compute prediction
    prediction = predict_outcome(feature_matrix, weights)
    # compute ro[i] = SUM[ [feature_i]*(output - prediction + weight[i]*[feature_i]) ]
    feature_i = feature_matrix[:,i]
    #ro_i = (feature_i * (output - prediction + np.dot(np.transpose(weights[i]),feature_i))).sum()
    ro_i = (np.transpose(feature_i) * (output - prediction  +  weights[i] * feature_i)).sum()
    
    if i == 0: # intercept -- do not regularize
        new_weight_i = ro_i
    elif ro_i < -l1_penalty/2.:
        new_weight_i = ro_i + l1_penalty/2.
    elif ro_i > l1_penalty/2.:
        new_weight_i = ro_i - l1_penalty/2.
    else:
        new_weight_i = 0.
    
    return new_weight_i

In [23]:
# should print 0.425558846691
import math
print (lasso_coordinate_descent_step(1, np.array([[3./math.sqrt(13),1./math.sqrt(10)],
                   [2./math.sqrt(13),3./math.sqrt(10)]]), np.array([1., 1.]), np.array([1., 4.]), 0.1))


0.425558846691

In [24]:
def lasso_cyclical_coordinate_descent(feature_matrix, output, initial_weights, l1_penalty, tolerance):
    iterations = 0
    
    while True:
        max_change = 0
        for i in range(len(initial_weights)):
            old_weight = initial_weights[i]
            initial_weights[i] = lasso_coordinate_descent_step(i, feature_matrix, output, initial_weights, l1_penalty)

            change = abs(old_weight-initial_weights[i])

            if change > max_change:
                max_change = change
                print (max_change)
                
        iterations += 1
        if max_change < tolerance:
            print ('Done in: ', iterations, ' iterations.')
            break
        
    return initial_weights

In [25]:
simple_features = ['sqft_living', 'bedrooms']
my_output= 'price'

(simple_feature_matrix, output) = get_numpy_data(sales, simple_features, my_output)
(normalized_simple_feature_matrix, norms) = normalize_features(simple_feature_matrix)
initial_weights = np.array([0., 0., 0.])
L1_penalty = 1e7
tolerance = 1.0

In [26]:
print (output)
print (normalized_simple_feature_matrix)


[ 221900.  538000.  180000. ...,  402101.  400000.  325000.]
[[ 0.00680209  0.00353021  0.00583571]
 [ 0.00680209  0.00768869  0.00583571]
 [ 0.00680209  0.00230361  0.00389048]
 ..., 
 [ 0.00680209  0.00305154  0.00389048]
 [ 0.00680209  0.00478673  0.00583571]
 [ 0.00680209  0.00305154  0.00389048]]

In [27]:
opt_weights = lasso_cyclical_coordinate_descent(normalized_simple_feature_matrix, output, initial_weights, L1_penalty, tolerance)
print (opt_weights)


1.71607878413e+12
3.39274526745e+16
6.46591545971e+20
1.34707602139e+25
2.66321443211e+29
5.07557095043e+33
1.05741870038e+38
2.09055220264e+42
3.98418764273e+46
8.30045439299e+50
1.64102764666e+55
3.1274808937e+59
6.51563501796e+63
1.28816287564e+68
2.45498897581e+72
5.11459947579e+76
1.01117345437e+81
1.92710078053e+85
4.0148239927e+89
7.93744156233e+93
1.51272264556e+98
3.15152961033e+102
6.23067964111e+106
1.18744687647e+111
2.4738665761e+115
4.89091711528e+119
9.32114084843e+123
1.94191919259e+128
3.83923931359e+132
7.31684662597e+136
1.52435470328e+141
3.01370032647e+145
5.7435291901e+149
1.19657773108e+154
2.36567426927e+158
4.50851702174e+162
9.39281561882e+166
1.85699112129e+171
3.53906545294e+175
7.37310940651e+179
1.45768843549e+184
2.77807186261e+188
5.7876939702e+192
1.14424649132e+197
2.18071221808e+201
4.54318519446e+205
8.98202936252e+209
1.71180085083e+214
3.56627904264e+218
7.0506531662e+222
1.34371795086e+227
2.7994338039e+231
5.53457443375e+235
1.0547827047e+240
2.19748077148e+244
4.34449311841e+248
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-27-e34bb2fcdaed> in <module>()
----> 1 opt_weights = lasso_cyclical_coordinate_descent(normalized_simple_feature_matrix, output, initial_weights, L1_penalty, tolerance)
      2 print (opt_weights)

<ipython-input-24-f1a924951706> in lasso_cyclical_coordinate_descent(feature_matrix, output, initial_weights, l1_penalty, tolerance)
      6         for i in range(len(initial_weights)):
      7             old_weight = initial_weights[i]
----> 8             initial_weights[i] = lasso_coordinate_descent_step(i, feature_matrix, output, initial_weights, l1_penalty)
      9 
     10             change = abs(old_weight-initial_weights[i])

<ipython-input-22-126ced5e48ce> in lasso_coordinate_descent_step(i, feature_matrix, output, weights, l1_penalty)
      5     feature_i = feature_matrix[:,i]
      6     #ro_i = (feature_i * (output - prediction + np.dot(np.transpose(weights[i]),feature_i))).sum()
----> 7     ro_i = (np.transpose(feature_i) * (output - prediction  +  weights[i] * feature_i)).sum()
      8 
      9     if i == 0: # intercept -- do not regularize

/home/weenkus/anaconda3/lib/python3.5/site-packages/numpy/matrixlib/defmatrix.py in __mul__(self, other)
    341         if isinstance(other, (N.ndarray, list, tuple)) :
    342             # This promotes 1-D vectors to row vectors
--> 343             return N.dot(self, asmatrix(other))
    344         if isscalar(other) or not hasattr(other, '__rmul__') :
    345             return N.dot(self, other)

KeyboardInterrupt: 

In [ ]:
RSS = (np.array(np.dot(normalized_simple_feature_matrix, opt_weights) - output) ** 2).sum()
print (RSS)

In [ ]:


In [ ]: